


*-------------------------------------------------------------------------------
* collect contact-card beat numbers
*-------------------------------------------------------------------------------

	/* Note: The first data that we received through FOIA, 
		     which we read in in the following section,
			 did not contain beat numbers. We submitted a second 
			 request explicitly requesting beat numbers and received
			 these files, which include *only* card numbers and 
			 beat numbers; this section collects those. */
			 
* read in from FOIA'd files
set excelxlsxlargefile on

* 2003–13, Sheet 0
import excel "data/inputs/contactcards/19813-P826389-CC_Data-2003-2013.xlsx", ///
	sheet("Export Worksheet") firstrow clear
	tempfile cc0313_0
	save `cc0313_0'
	
* 2003–13, Sheets 1–5
forval i = 1/5 {
	import excel "data/inputs/contactcards/19813-P826389-CC_Data-2003-2013.xlsx", ///
	sheet("Sheet`i'") firstrow clear
	tempfile cc0313_`i'
	save `cc0313_`i''
	}

* 2014–22, Sheet 0	
import excel "data/inputs/contactcards/19813-P826389-CC_Data-2014-2022.xlsx", ///
	sheet("Export Worksheet") firstrow clear
	tempfile cc1422_0
	save `cc1422_0'

* 2014–22, Sheets 1–4
forval i = 1/4 {
	import excel "data/inputs/contactcards/19813-P826389-CC_Data-2014-2022.xlsx", ///
	sheet("Sheet`i'") firstrow clear
	tempfile cc1422_`i'
	save `cc1422_`i''
	}

* append and save
clear all
forval i = 0/4 {	
	append using `cc1422_`i''
	append using `cc0313_`i''
}	
append using `cc0313_5'
renvars, lower

* remove 36 duplicates out of more than 10 million
*sort cardno contactdate district beat
*by cardno: keep if _n == 1

* blanks
replace cardno = trim(cardno)

* contact date
split contactdate
gen double date = date(contactdate1, "DM20Y")
replace date = date(contactdate1, "MDY") if date == .
format date %td
gen month = mofd(date)
format month %tm
assert date != .
gen year = yofd(date)

* truly tiny number of true duplicates (168 of 10.6m)
duplicates drop cardno date, force
assert r(N_drop) < 170

* restrict years
drop if year < 2009

* save
tempfile ccbeats
save `ccbeats'




*-------------------------------------------------------------------------------
* collect contact-card data (no beat numbers)
*-------------------------------------------------------------------------------

* 2014–2018, Part 3	
insheet using "data/inputs/contactcards/19813-P826389-CC-Data-2014-2018p3.csv", ///
	names clear	
renvars, lower
tempfile cc_2014_2018_p3
save `cc_2014_2018_p3'

* 2014–2018, Part 2
insheet using "data/inputs/contactcards/19813-P826389-CC-Data-2014-2018p2.csv", ///
	names clear	
renvars, lower	
tempfile cc_2014_2018_p2
save `cc_2014_2018_p2'
	
* 2014–2018, Part 1
insheet using "data/inputs/contactcards/19813-P826389-CC-Data-2014-2018p1.csv", ///
	names clear		
renvars, lower	
tempfile cc_2014_2018_p1
save `cc_2014_2018_p1'

* 2013
import delimited "data/inputs/contactcards/19813-P826389-CC-Data-2013.csv", delimiter(comma) clear 
tempfile cc2013
save `cc2013'

* 2019–2022, Part 1
import delimited  "data/inputs/contactcards/19813-P826389-CC-Data-2019-2022p1.csv", delimiter(comma) clear
tempfile cc2019p1
save `cc2019p1'

* 2019–2022, Part 2
import delimited   "data/inputs/contactcards/19813-P826389-CC-Data-2019-2022p2.csv", delimiter(comma) clear
tempfile cc2019p2
save `cc2019p2'

* 2019–2022, Part 3
import delimited   "data/inputs/contactcards/19813-P826389-CC-Data-2019-2022p3.csv", delimiter(comma) clear
tempfile cc2019p3
save `cc2019p3'

* 2003–2012
use "data/inputs/contactcards/19508-P797050_CC_data_2003-2012_i_.dta", clear
rename A cardno
rename B contact_date
rename H statutedescription
rename I citationyn
rename J sex
rename K race

* append
append using `cc_2014_2018_p3'
append using `cc_2014_2018_p2'
append using `cc_2014_2018_p1'
*append using `cc2013' /* 2013 is already in the "2003–2012" data */
append using `cc2019p1'
append using `cc2019p2'
append using `cc2019p3'

* year counts
split contact_date
gen double date = date(contact_date1, "DM20Y")
replace date = date(contact_date1, "MDY") if date == .
format date %td
gen month = mofd(date)
format month %tm
assert date != .
gen year = yofd(date)



*-------------------------------------------------------------------------------
* merge with contact-card beat numbers
*-------------------------------------------------------------------------------

* leading and trailing blanks
replace cardno = trim(cardno)


* drop 4K duplicates out of 7.5m
duplicates drop cardno date, force
assert r(N_drop) < 4100

* merge
merge 1:1 cardno date using `ccbeats'

* merge is near-perfect for 2014+
* we don't use beat-specific stop data for pre-2014
tab year _m
drop if _m == 2
drop _m

* assume that we do not have beat info for <= 2013
replace beat = "" if year <= 2013


*-------------------------------------------------------------------------------
* counts
*-------------------------------------------------------------------------------

*pedestrian vs traffic
gen pedestrian = statutedescription == "" 
gen traffic = statutedescription != "" 

/* NOTE: Beginning in 2012, these data stop using the
   ASIAN and HISPANIC categories for traffic stops, and it is 
   clear from the TAB that those stops are now reported (in these data)
   as race MISSING (see tab year race if traffic == 1, m row).
   
   They must still capture those categories somewhere, because they are
   reported to IDOT. But they're not included here. In order to 
   compare with IDOT, then, we create a new category that sums:
   "AHRM" — Asian, Hispanic, and Race Missing.

   */  


* race: traffic
gen traffic_black = (race == "BLACK" & traffic == 1)
gen traffic_AHRM = ((race == "HISPANIC" | regexm(race, "ASIAN") | race == "") ///
	& traffic == 1)
gen traffic_white = ((race == "WHITE" ) & traffic == 1)
gen traffic_AMERIND = ((regexm(race, "AMER") & traffic == 1))


* race: pedestrian
gen pedestrian_black = (regexm(race, "BLACK") & pedestrian == 1) 
	/* NOTE: This includes Black Hispanic, a category used only in 
	2014 and 2015, when it was less than half of one pct of stops */

gen pedestrian_AMERIND = (regexm(race, "AMER") & pedestrian == 1)
gen pedestrian_asian = (regexm(race, "ASIAN") & pedestrian == 1) 
	/* NOTE: It looks as if use of this category changed in 2012 
	(relative to 2011); we are not using data pre-2012 */

gen pedestrian_hispanic = ((race == "HISPANIC" | race == "WHITE HISPANIC") ///
	& pedestrian == 1)
	/* NOTE: "White Hispanic" replaced "Hispanic" in 2014 */

gen pedestrian_white = (race == "WHITE" & pedestrian == 1)	
gen pedestrian_missing = ((race == "UNKNOWN" | race == "") & pedestrian == 1)


*collapse
collapse (sum) pedestrian* traffic*, by(beat month)


* save
save "data/outputs/CC_beat.dta", replace


* End
